In [1]:
#import usual libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm

#install library to convert country names to ISO3 country codes
!pip install country_converter --upgrade

#import library for heat map
!pip install geopandas
import geopandas as gpd
In [2]:
#union density dataset
union = pd.read_csv('https://raw.githubusercontent.com/JulieChandlerDS/ILO_union_density/main/data-SmplN.csv')
#world happiness report 2019 (chosen because the union dataset has the most entries for 2019)
happiness = pd.read_csv('https://raw.githubusercontent.com/JulieChandlerDS/world-happiness-report/main/2019.csv')
In [3]:
#only use 2019 data since that's the year with the most union observations
union = union.loc[union['time']==2019.0]

Data Cleaning

In [4]:
#remove unnecessary columns from the union dataframe; we just need the observation value and the country name
union = union.drop(['time', 'area_time', 'Column 1','Column 1.1','Column 1.2'], axis = 1)
In [5]:
#do the same for happiness
happiness = happiness.drop(['Overall rank', 'GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption'], axis = 1)

We'll center the data. This is as simple as subtracting the mean from each observation's score:

$x-\bar{x}$
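
For example, scores of 3, 5, and 7 have a mean of 5, so the centered values are -2, 0, and 2.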

In [6]:
#store the original mean of union density (for reference)
union_mean = union['obs_value'].mean()
#store the original mean of the happiness score (for reference)
happiness_mean = happiness['Score'].mean()
In [7]:
#centering data
union['obs_value'] = union['obs_value'] - union['obs_value'].mean()
happiness['Score'] = happiness['Score'] - happiness['Score'].mean()

Next we'll scale the data by dividing each centered value by its standard deviation (z-score standardization):

$x' = \frac{x-\bar{x}}{\sigma_x}$
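
Continuing the small example above, the centered values -2, 0, and 2 have a (sample) standard deviation of 2, so the standardized values are -1, 0, and 1.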

In [8]:
#Scale happiness (already centered above, so this just divides by the standard deviation)
happiness['Score'] = (happiness['Score']-happiness['Score'].mean())/happiness['Score'].std()
#Scale union density likewise
union['obs_value'] = (union['obs_value']-union['obs_value'].mean())/union['obs_value'].std()
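
As a quick sanity check (a minimal sketch, not one of the original cells), the transformed columns should now have a mean of roughly 0 and a standard deviation of roughly 1:

#verify the centering and scaling: means ~0, standard deviations ~1
print(happiness['Score'].mean(), happiness['Score'].std())
print(union['obs_value'].mean(), union['obs_value'].std())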
In [9]:
happiness
Out[9]:
Country or region Score
0 Finland 2.121877
1 Denmark 1.970052
2 Norway 1.928727
3 Iceland 1.874824
4 Netherlands 1.869434
... ... ...
151 Rwanda -1.862420
152 Tanzania -1.954952
153 Afghanistan -1.980107
154 Central African Republic -2.087912
155 South Sudan -2.294538

156 rows × 2 columns

In [10]:
import country_converter as coco
happiness['Country or region'] = coco.convert(names=happiness['Country or region'], to='ISO3')
happiness
Out[10]:
Country or region Score
0 FIN 2.121877
1 DNK 1.970052
2 NOR 1.928727
3 ISL 1.874824
4 NLD 1.869434
... ... ...
151 RWA -1.862420
152 TZA -1.954952
153 AFG -1.980107
154 CAF -2.087912
155 SSD -2.294538

156 rows × 2 columns

In [11]:
happiness.rename(columns = {'Country or region':'ref_area'}, inplace = True)
In [12]:
#combine datasets
df = happiness.merge(union, how='left')
df
Out[12]:
ref_area Score obs_value
0 FIN 2.121877 2.089948
1 DNK 1.970052 2.544855
2 NOR 1.928727 1.623946
3 ISL 1.874824 3.898480
4 NLD 1.869434 -0.317729
... ... ... ...
151 RWA -1.862420 -0.844755
152 TZA -1.954952 NaN
153 AFG -1.980107 -0.240062
154 CAF -2.087912 NaN
155 SSD -2.294538 NaN

156 rows × 3 columns


In [14]:
df.rename(columns={'Score':'Happiness'}, inplace=True)
df.rename(columns={'obs_value':'Union Density'}, inplace=True)
df.rename(columns={'ref_area':'Country'}, inplace=True)
df = df.dropna()
df
Out[14]:
Country Happiness Union Density
0 FIN 2.121877 2.089948
1 DNK 1.970052 2.544855
2 NOR 1.928727 1.623946
3 ISL 1.874824 3.898480
4 NLD 1.869434 -0.317729
6 SWE 1.739169 2.444997
9 AUT 1.652027 0.281417
11 CRI 1.581055 -0.034799
13 LUX 1.511880 0.392370
14 GBR 1.479539 0.126083
16 DEU 1.417551 -0.267800
17 BEL 1.361851 1.551827
26 GTM 0.924342 -1.000089
29 ESP 0.850676 -0.484158
30 PAN 0.821029 0.286964
31 BRA 0.802163 -0.450872
33 SGP 0.768025 0.059511
35 ITA 0.732988 0.630918
36 BHR 0.711427 -1.027827
37 SVK 0.710529 -0.556277
41 LTU 0.666508 -0.761540
42 COL 0.644947 -0.911326
51 THA 0.539837 -0.988993
52 LVA 0.478748 -0.584015
54 EST 0.436524 -0.839207
57 JPN 0.430236 -0.240062
60 BOL 0.334109 -0.506348
61 HUN 0.315244 -0.689421
62 PRY 0.301768 -0.789278
70 MDA 0.109515 -0.140204
76 DOM 0.016084 -0.755992
78 TUR -0.030631 -0.622849
82 MNG -0.109688 0.536608
83 MKD -0.119570 -0.245609
88 MAR -0.178863 -0.567373
91 IDN -0.193237 -0.450872
97 GHA -0.369319 -0.240062
101 BEN -0.470835 0.592085
105 ZAF -0.615474 0.442298
109 PSE -0.638832 0.009582
110 SEN -0.652307 -0.761540
111 SOM -0.663986 -0.517444
115 ARM -0.761909 -0.034799
118 GEO -0.797844 -0.179038
120 KEN -0.806828 -0.633944
123 TUN -0.849950 0.941586
129 LKA -0.935296 -0.611754
132 UKR -0.965840 0.858372
133 ETH -1.007166 -0.650587
137 ZMB -1.167975 0.153821
138 TGO -1.187739 0.148273
143 LSO -1.441980 -0.905779
151 RWA -1.862420 -0.844755
153 AFG -1.980107 -0.240062

Visualization

Scatter Plot

In [15]:
#scatter plot
plt.style.use('fivethirtyeight')
plt.scatter(df['Happiness'], df['Union Density'])
plt.title("correlation between happiness and union density")
plt.xlabel("happiness")
plt.ylabel("union density")
plt.show();

Histograms

In [16]:
plt.hist(df['Happiness'])
plt.show()
In [17]:
plt.hist(df['Union Density'])
plt.show()

Heat Map

In [18]:
import seaborn as sns
# Extract relevant columns from the dataframe
corr_df = df[['Union Density', 'Happiness']]

# Compute the correlation matrix
corr_matrix = corr_df.corr()

# Plot the heatmap
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)

# Add title and axis labels
plt.title('Correlation Heatmap of Union Density and Happiness')
plt.xlabel('Variable')
plt.ylabel('Variable')

# Show the plot
plt.show()

Machine learning

Pooled Linear regression

A linear regression seems most appropriate for testing whether one variable predicts another, since that is its usual use.

$Y_i=f(X_i, \beta)+e_i$
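
In our case this reduces to a straight-line model, $Y_i=\beta_0+\beta_1 X_i+e_i$, with happiness as $X$ and union density as $Y$.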

In [19]:
#define the names of the dependent and independent variables for the regressions below
y_var_name = 'Union Density'
X_var_names = ['Happiness']
In [20]:
#pooled linear regression: one fitted line across all countries
import seaborn as sns
plt.style.use('fivethirtyeight')
sns.regplot(data=df, x='Happiness', y='Union Density')
plt.title("Correlation between happiness and union density")
plt.xlabel("Happiness")
plt.ylabel("Union density");

K-means clustering

In [23]:
from sklearn.cluster import KMeans

# Extract relevant columns from the dataframe
X = df[['Union Density', 'Happiness']]

# Define number of clusters
k = 4

# Fit KMeans algorithm to the data (n_init is set explicitly to avoid a FutureWarning)
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)

# Add a new column to the dataframe with the cluster labels (.loc avoids a chained-assignment warning)
df.loc[:, 'Cluster'] = kmeans.labels_

# Plot the clusters
plt.scatter(df['Union Density'], df['Happiness'], c=df['Cluster'])
plt.xlabel('Union Density')
plt.ylabel('Happiness')
plt.title('K-means clustering of union density and happiness')
plt.show()

Correlation or causation

OLS, or ordinary least squares, estimates the coefficients of our linear regression by minimizing the sum of squared residuals between the fitted line and the data.

$\hat{{\beta}}=\left(\mathbf{X}^{\top} \mathbf{X}\right)^{-1} \mathbf{X}^{\top} \mathbf{y}$
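
As a quick check (a minimal sketch, assuming the cleaned df from above), the normal-equations formula can be evaluated directly with NumPy; its result should match the Happiness coefficient that statsmodels reports below:

#solve the normal equations by hand for the no-intercept model
X_mat = df['Happiness'].to_numpy().reshape(-1, 1)   #design matrix without a constant column
y_vec = df['Union Density'].to_numpy()
beta_hat = np.linalg.inv(X_mat.T @ X_mat) @ X_mat.T @ y_vec
print(beta_hat)   #should agree with the sm.OLS coefficient below (about 0.40)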

In [24]:
#OLS for union density vs happiness
np.random.seed(9876789)

X = df['Happiness']
y = df['Union Density']

#no constant is added to X, so the model is fit without an intercept (hence the uncentered R-squared below)
model = sm.OLS(y, X, missing='drop')
results = model.fit()
results.summary()
Out[24]:
                                 OLS Regression Results                                
=======================================================================================
Dep. Variable:          Union Density   R-squared (uncentered):                   0.190
Model:                            OLS   Adj. R-squared (uncentered):              0.175
Method:                 Least Squares   F-statistic:                              12.42
Date:                Thu, 06 Apr 2023   Prob (F-statistic):                    0.000886
Time:                        00:21:10   Log-Likelihood:                         -70.649
No. Observations:                  54   AIC:                                      143.3
Df Residuals:                      53   BIC:                                      145.3
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
=======================================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Happiness      0.4025      0.114      3.524      0.001       0.173       0.632
=======================================================================================
Omnibus:                       15.667   Durbin-Watson:                   1.581
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               18.327
Skew:                           1.166   Prob(JB):                     0.000105
Kurtosis:                       4.646   Cond. No.                         1.00
=======================================================================================


Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Pooled OLS

In [25]:
# Define X and y variables using the column names set earlier
y_var = df[y_var_name]
X_var = df[X_var_names]
In [26]:
# Carve out y variable
pooled_y = y_var
In [27]:
# Carve out X variable
pooled_X = X_var
In [28]:
# Add the placeholder for the regression intercept. When the model is fitted, the coefficient of this variable is the regression model’s intercept β_0.
pooled_X = sm.add_constant(pooled_X)
In [29]:
#Build the OLS regression model:

pooled_olsr_model = sm.OLS(endog=pooled_y, exog=pooled_X)
In [30]:
#Train the model on the (y, X) data set and fetch the training results:

pooled_olsr_model_results = pooled_olsr_model.fit()
In [31]:
#Print the training summary:
print(pooled_olsr_model_results.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:          Union Density   R-squared:                       0.197
Model:                            OLS   Adj. R-squared:                  0.182
Method:                 Least Squares   F-statistic:                     12.78
Date:                Thu, 06 Apr 2023   Prob (F-statistic):           0.000765
Time:                        00:21:10   Log-Likelihood:                -70.391
No. Observations:                  54   AIC:                             144.8
Df Residuals:                      52   BIC:                             148.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0897      0.127     -0.707      0.483      -0.344       0.165
Happiness      0.4216      0.118      3.575      0.001       0.185       0.658
==============================================================================
Omnibus:                       14.562   Durbin-Watson:                   1.596
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               16.366
Skew:                           1.119   Prob(JB):                     0.000279
Kurtosis:                       4.506   Cond. No.                         1.28
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Pearson Correlation

The Pearson correlation coefficient measures the linear relationship between two datasets. Like other correlation coefficients, it varies between -1 and +1, with 0 implying no correlation. Correlations of -1 or +1 imply an exact linear relationship.

Note that the p-value calculation relies on the assumption that each dataset is normally distributed.

$r = \frac{\sum(x-m_x)(y-m_y)}{\sqrt{\sum(x-m_x)^2\sum(y-m_y)^2}}$
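
As a quick check (a minimal sketch using the cleaned data from above), the formula can be evaluated directly with NumPy; the result should agree with scipy's pearsonr below:

#compute Pearson's r by hand from the formula above
x_arr = df['Happiness'].to_numpy()
y_arr = df['Union Density'].to_numpy()
num = ((x_arr - x_arr.mean()) * (y_arr - y_arr.mean())).sum()
den = np.sqrt(((x_arr - x_arr.mean())**2).sum() * ((y_arr - y_arr.mean())**2).sum())
print(num / den)   #should agree with sp.stats.pearsonr below (about 0.444)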

In [32]:
#check for NaN or infinite values (the notebook displays only the final expression's result)
np.isnan(X).any()
np.isnan(y).any()

np.isinf(X).any()
np.isinf(y).any()
Out[32]:
False
In [33]:
#replace any NaN or infinite values with finite numbers (there should be none left after the earlier dropna)
X = np.nan_to_num(X)
y = np.nan_to_num(y)
In [34]:
sp.stats.pearsonr(X, y)
Out[34]:
PearsonRResult(statistic=0.44422304265859053, pvalue=0.0007654853445457307)